# Install and import packages
# Install skimr package for the first time -> install.packages("skimr")
library(ggplot2)
library(dplyr)
library(skimr)
library(plotly)
# Load the VOTE data set from disk and keep it in the environment.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
vote_data <- read.csv(
  file = "C:/Users/tobia/Desktop/University/University of St. Gallen/Semester 4/2. Data Analytics 2/R - Exercises/Data/VOTE.csv",
  header = TRUE
)
# Summarise every column (types, missingness, distribution) with skimr
skim(vote_data)
| Name | vote_data |
| Number of rows | 173 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| state | 0 | 1 | 2 | 2 | 0 | 44 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X | 0 | 1 | 87.99 | 50.09 | 1.00 | 45.00 | 88.00 | 131.00 | 174.00 | ▇▇▇▇▇ |
| district | 0 | 1 | 8.86 | 8.75 | 1.00 | 3.00 | 6.00 | 11.00 | 42.00 | ▇▂▁▁▁ |
| democA | 0 | 1 | 0.55 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| voteA | 0 | 1 | 51.24 | 19.80 | 16.00 | 36.00 | 50.00 | 65.00 | 189.00 | ▇▇▁▁▁ |
| expendA | 0 | 1 | 307.21 | 280.70 | 0.30 | 60.67 | 241.45 | 453.46 | 1470.67 | ▇▅▂▁▁ |
| expendB | 0 | 1 | 304.13 | 306.23 | 0.93 | 60.05 | 221.53 | 450.72 | 1548.19 | ▇▃▂▁▁ |
| prtystrA | 0 | 1 | 49.67 | 9.96 | 22.00 | 44.00 | 50.00 | 56.00 | 71.00 | ▁▃▇▆▃ |
| shareA | 0 | 1 | 50.80 | 33.60 | 0.09 | 18.54 | 50.82 | 84.26 | 99.50 | ▇▅▅▅▇ |
# Removing the district RU from the data frame as it does not represent
# accurate data -> over 100% of the votes.
# The row is selected by the condition that makes it invalid (voteA > 100)
# instead of by its position, so the step stays correct if the file is
# re-sorted or rows are added.
# NOTE(review): the original dropped row 173 by index; confirm that it is the
# only observation with voteA > 100.
vote_data <- vote_data[vote_data$voteA <= 100, ]
# Additional variables
# Add three derived columns to the data frame:
#   expenddiff - spending of B minus spending of A
#   expendA2   - squared spending of A
#   expendB2   - squared spending of B
vote_data <- vote_data %>%
  mutate(
    expenddiff = expendB - expendA,
    expendA2 = expendA^2,
    expendB2 = expendB^2
  )
# Show the first rows to check the new columns
head(vote_data)
## X state district democA voteA expendA expendB prtystrA shareA expenddiff
## 1 1 AL 7 1 68 328.296 8.737 41 97.40767 -319.55899
## 2 3 AZ 2 1 73 99.607 3.065 55 97.01476 -96.54200
## 3 4 AZ 3 0 69 319.690 26.281 64 92.40370 -293.40900
## 4 5 AR 3 0 75 159.221 60.054 66 72.61247 -99.16699
## 5 6 AR 4 1 69 570.155 21.393 46 96.38355 -548.76203
## 6 7 CA 2 0 59 696.748 193.915 58 78.22802 -502.83299
## expendA2 expendB2
## 1 107778.257 76.335177
## 2 9921.555 9.394225
## 3 102201.698 690.690968
## 4 25351.325 3606.483019
## 5 325076.757 457.660434
## 6 485457.756 37603.024621
Here write your comment:
The available data seems to be incomplete, although no variable has
missing values. For example, only 43 states out of 50 are represented,
and the districts within each state also seem to be limited, as not
every single one is shown here.
Descriptive statistics. Answer the following questions.
(i) Generate two scatter plots showing the relationship between voteA on the vertical axis and expendA, or expendB, on the horizontal axis. Make sure that the plots also contain a line showing the linear relationship of the variables. Interpret their slope.
# HERE: Create your scatterdiagram
# Scatter plot of voteA against expendA with the fitted linear regression line.
# The x-axis stays on its natural scale: ggplot2 applies scale transformations
# before computing statistics, so with scale_x_log10() the smoother would fit
# voteA ~ log10(expendA) and the drawn line would not correspond to the
# levels model (modelA) whose slope is reported and interpreted below.
ggplot(vote_data, aes(expendA, voteA, color = factor(democA))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  labs(title = "Democratic campaign spending vs. % of votes",
       x = "Campaign spending for democratic party, 1000s",
       y = "Percentage of votes for democratic party",
       color = "Rep vs. Dem") +
  scale_color_manual(values = c("red", "blue")) +
  # Dashed reference line at the 50% majority threshold
  geom_hline(yintercept = 50, color = "black", linetype = "dashed") +
  ylim(0, 100)
# Linear model behind the plotted line: vote share of A on spending of A
modelA <- lm(voteA ~ expendA, data = vote_data)
summary(modelA)
##
## Call:
## lm(formula = voteA ~ expendA, data = vote_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.751 -12.403 -4.326 15.945 32.714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.177094 1.760267 24.529 < 2e-16 ***
## expendA 0.023509 0.004223 5.566 9.97e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.51 on 170 degrees of freedom
## Multiple R-squared: 0.1542, Adjusted R-squared: 0.1492
## F-statistic: 30.98 on 1 and 170 DF, p-value: 9.969e-08
Here: write your comment:
There is a positive linear relationship between expendA and voteA. The
intercept is 43.18 and the slope is 0.02351, i.e. each additional $1,000
of spending (expendA is measured in 1000s) is associated with about 0.024
percentage points more votes. The low R^2 indicates that only about 15% of
the variance in voteA is explained. The Democratic campaign targets
Republican states with blue or red districts: the color scheme shows that
Republican states with blue/red districts are a focus of Democratic
campaign spending.
# HERE: Create your scatterdiagram
# Scatter plot of the opposing vote share (100 - voteA) against expendB with
# the fitted linear regression line.
# The x-axis stays on its natural scale: ggplot2 applies scale transformations
# before computing statistics, so with scale_x_log10() the smoother would fit
# the vote share on log10(expendB) and the drawn line would not correspond to
# the levels model (modelB) whose slope is reported and interpreted below.
ggplot(vote_data, aes(expendB, 100 - voteA, color = factor(democA))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  labs(title = "Republican campaign spending vs. % of votes",
       x = "Campaign spending for republican party, 1000s",
       y = "Percentage of votes for republican party",
       color = "Rep vs. Dem") +
  scale_color_manual(values = c("red", "blue")) +
  # Dashed reference line at the 50% majority threshold
  geom_hline(yintercept = 50, color = "black", linetype = "dashed") +
  ylim(0, 100)
# Linear model behind the plotted line: opposing vote share on spending of B
modelB <- lm(100 - voteA ~ expendB, data = vote_data)
summary(modelB)
##
## Call:
## lm(formula = 100 - voteA ~ expendB, data = vote_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.243 -11.643 -4.553 13.866 37.525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.541617 1.644411 25.870 < 2e-16 ***
## expendB 0.023060 0.003808 6.056 8.7e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.29 on 170 degrees of freedom
## Multiple R-squared: 0.1774, Adjusted R-squared: 0.1726
## F-statistic: 36.67 on 1 and 170 DF, p-value: 8.701e-09
Here: write your comment:
Republican expenditure (expendB) positively correlates with Republican
votes (100 - voteA). The intercept is 42.54 and the slope is 0.02306, with
an R-squared of about 18%. No significant difference between Democrats and
Republicans. Republicans spend less on blue states/districts, focusing
on already “red” ones, and those below 40% votes.
(iv) A political campaign manager would like to know by how much the election results of her own party would improve if she invested 10,000$ extra funding in her political campaign. How would you help her, and what is your answer?
# Here: write your code
# Narrow the data down to the Nebraska (NE) row(s) with democA == 0 and a
# voteA strictly between 45 and 50, then add a column holding expendA
# increased by 10.
pick_district <- mutate(
  filter(vote_data, state == "NE", democA == 0, voteA > 45, voteA < 50),
  add_expendA = expendA + 10
)
pick_district
## X state district democA voteA expendA expendB prtystrA shareA expenddiff
## 1 101 NE 2 0 49 1158.294 858.762 58 57.42498 -299.5319
## expendA2 expendB2 add_expendA
## 1 1341645 737472.2 1168.294
Here: write your comment:
Filtering data reveals a potential swing district (District 2, NE) at
49% in a Republican state. A $10,000 investment may improve it, but the
slope (0.023) suggests a minor increase to 49.23%. More investment
recommended for significant impact on the district.
(v) The same political campaign manager is from Texas. She is concerned that your results do not account for the particular electoral situation in Texas. What would you do to advise her? What is your answer? What are potential limitations of your answer?
# Here: write your code
# Collapse the district rows into one row per state: number of districts,
# unweighted mean of voteA and shareA, and total spending of A and of B.
state_data <- vote_data %>%
  group_by(state) %>%
  summarise(
    count = n(),
    avg_state_vote = sum(voteA) / count,
    state_shareA = sum(shareA) / count,
    state_expendA = sum(expendA),
    state_expendB = sum(expendB)
  )
# Bar chart of the averaged vote per state; bars below 50 are drawn red,
# all others blue, with a dashed reference line at 50.
plt <- ggplot(data = state_data, mapping = aes(x = state, y = avg_state_vote)) +
  geom_col(fill = ifelse(state_data$avg_state_vote < 50, "red", "blue")) +
  geom_abline(intercept = 50, slope = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(limits = c(0, 100)) +
  labs(title = "Blue vs. Red States",
       x = "States",
       y = "State vote averaged") +
  theme_light()
# Render the chart as an interactive plotly widget
ggplotly(plt)
# Another graph that shows the individual additional 10k investment
# Baseline: all Texas rows, left untouched so it can still be compared with
# the scenario below.
Texas <- vote_data[vote_data$state == "TX", ]
# Extra funding; the spending columns are in 1000s, so 10 stands for $10,000
texas10k <- 10
# Build the scenario on a copy. The original modified `Texas` first and copied
# it afterwards, which overwrote the baseline figures.
# Row 4 of the Texas subset receives the extra funding.
# NOTE(review): the plot title refers to district 13; confirm that row 4 is
# that district.
Texas_added10k <- Texas
Texas_added10k$expendA[4] <- Texas_added10k$expendA[4] + texas10k
# Recompute shareA from the updated spending instead of hard-coding the
# result: shareA = 100 * expendA / (expendA + expendB), which reproduces the
# shareA values printed by head(vote_data).
# NOTE(review): the original hard-coded 55.82588 here; confirm the recomputed
# value agrees.
Texas_added10k$shareA[4] <- 100 * Texas_added10k$expendA[4] /
  (Texas_added10k$expendA[4] + Texas_added10k$expendB[4])
# Now plot a graph to visualize it
ggplot(Texas_added10k, aes(expendA, voteA, color = factor(democA))) +
  geom_point() +
  # Label every point with its district number
  geom_text(aes(label = district), hjust = 0, vjust = -1) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_x_log10() +
  labs(title = "Effect on district 13 after 10K additional investment",
       x = "Campaign spending for democratic party, 1000s",
       y = "Percentage of votes for democratic party",
       color = "Rep vs. Dem") +
  scale_color_manual(values = c("red", "blue")) +
  geom_abline(intercept = 50, slope = 0, color = "black", linetype = "dotted") +
  ylim(0, 100)
Here: write your comment:
There are only 43 states in the dataset; there should be 50. Not all
districts within a state are listed. Therefore our findings are limited
by the dataset available, and we would have to be cautious in advising any
extra spending, as Texas is an important state in the elections.